https://kforthman.shinyapps.io/500citiescounties/
#
#remove scientific notation
options(scipen=999)
library(stringr)
library(corrplot)
## corrplot 0.84 loaded
library(shiny)
library(lme4)
## Loading required package: Matrix
library(lmerTest)
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
# Restore the pre-computed county-level tables stored in the two .rda files.
load("Data/county_factors.rda")
load("Data/county_500CitiesData.rda")

# Directory holding the county-level time-series CSV files.
data.path <- "Data/COVID-19/csse_covid_19_data/csse_covid_19_time_series/"

# Read in the data.
US.deaths <- read.csv(
  paste0(data.path, "time_series_covid19_deaths_US.csv"),
  header = TRUE, stringsAsFactors = FALSE)
US.cases <- read.csv(
  paste0(data.path, "time_series_covid19_confirmed_US.csv"),
  header = TRUE, stringsAsFactors = FALSE)

# Read in the header separately. With header = FALSE the first line arrives
# as an ordinary data row, so the raw column labels are kept exactly as they
# appear in the file. Only that first line is needed, so stop after one row
# (nrows = 1) instead of parsing the whole file a second time.
US.cases.head <- read.csv(
  paste0(data.path, "time_series_covid19_confirmed_US.csv"),
  header = FALSE, stringsAsFactors = FALSE, nrows = 1)
US.deaths.head <- read.csv(
  paste0(data.path, "time_series_covid19_deaths_US.csv"),
  header = FALSE, stringsAsFactors = FALSE, nrows = 1)
# Correct the dates in the header to be more usable as column names.
#
# Each "/"-separated label (e.g. "1/22/20") is split into its parts, every
# part is left-padded with "0" to at least two characters, and the parts are
# joined with "_" (giving "01_22_20").
#
# dates: a character vector, a list, or a one-row data frame whose elements
#        are date labels. Factor elements are accepted as well.
# Returns a character vector with one entry per element of `dates`. Names are
#        carried over from `dates` (for an unnamed character vector the
#        labels themselves are used as names).
proper_date <- function(dates) {
  vapply(dates, function(label) {
    # as.character() lets factor columns through; strsplit() rejects factors.
    parts <- strsplit(as.character(label), split = "/", fixed = TRUE)[[1]]
    # Number of zeros still missing in front of each part (never negative).
    n_missing <- pmax(0L, 2L - nchar(parts))
    paste(paste0(strrep("0", n_missing), parts), collapse = "_")
  }, character(1))
}
# Clean up the date labels of both tables. The dates start after the leading
# descriptive columns: 11 of them in the cases file, 12 in the deaths file.
dates.cases <- proper_date(US.cases.head[-(1:11)])
dates.deaths <- proper_date(US.deaths.head[-(1:12)])

# Put the raw descriptive labels back and attach the cleaned date labels.
names(US.cases) <- c(unlist(US.cases.head[1, 1:11], use.names = FALSE), dates.cases)
names(US.deaths) <- c(unlist(US.deaths.head[1, 1:12], use.names = FALSE), dates.deaths)

# Both tables are used side by side below, so their rows must describe the
# same locations in the same order.
if (any(US.cases$UID != US.deaths$UID, na.rm = TRUE)) {
  warning("COVID data rows do not match!")
}

# Copy the population over from the deaths table, then move the new (last)
# column in front of the date columns so both tables share the same layout.
US.cases$Population <- US.deaths$Population
US.cases <- US.cases[, c(1:11, ncol(US.cases), 12:(ncol(US.cases) - 1))]
# Directory holding one state-level report per day.
data.path <- "Data/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports_us/"
daily_filenames <- list.files(data.path)
daily_filenames <- daily_filenames[daily_filenames != "README.md"]
if (length(daily_filenames) == 0) {
  stop("No daily report files found in ", data.path)
}

# list.files() returns names in alphabetical order. For month-first names
# this is not chronological once the reports span more than one calendar year
# ("01-01-2021.csv" sorts before "04-12-2020.csv"), so order by parsed date.
# NOTE(review): assumes every file name starts with MM-DD-YYYY - confirm
# against the data directory. If any name does not parse, the original
# alphabetical order is kept.
report_dates <- as.Date(substring(daily_filenames, 1, 10), format = "%m-%d-%Y")
if (anyNA(report_dates)) {
  warning("Could not parse a date from every daily report file name; ",
          "keeping alphabetical order.")
} else {
  daily_filenames <- daily_filenames[order(report_dates)]
}

# The most recent report is the last one after sorting.
todays_report_filename <- daily_filenames[length(daily_filenames)]
US.todaysReport <- read.csv(
  paste0(data.path, todays_report_filename),
  header = TRUE, stringsAsFactors = FALSE)
# Names of the states and other reporting units kept from the daily reports.
all.states <- c(
  'Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas', 'California',
  'Colorado', 'Connecticut', 'Delaware', 'Diamond Princess',
  'District of Columbia', 'Florida', 'Georgia', 'Grand Princess', 'Guam',
  'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
  'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
  'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
  'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota',
  'Northern Mariana Islands', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
  'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
  'Texas', 'Utah', 'Vermont', 'Virgin Islands', 'Virginia', 'Washington',
  'West Virginia', 'Wisconsin', 'Wyoming')
all.states.df <- data.frame(Province_State = all.states)

# Statistics available in the daily reports.
all.stats <- c("Confirmed", "Deaths", "Recovered", "Active", "Incident_Rate", "People_Tested", "People_Hospitalized", "Mortality_Rate", "Testing_Rate", "Hospitalization_Rate")

# One data frame per daily report, in the order of `daily_filenames`, each
# restricted to the rows whose Province_State appears in `all.states`.
# lapply() handles an empty file list (a `1:length()` loop would iterate over
# c(1, 0)) and keeps the per-file temporaries out of the global environment.
compiled.stats <- lapply(daily_filenames, function(filename) {
  report <- read.csv(
    paste0(data.path, filename),
    header = TRUE, stringsAsFactors = FALSE)
  merge(all.states.df, report, all.y = FALSE)
})
# Name each element after the first 10 characters of its file name (the date).
names(compiled.stats) <- substring(daily_filenames, 1, 10)
# Bar plot of one statistic for one state across all daily reports.
#
# state: value of Province_State to plot (e.g. "Oklahoma").
# stat:  name of a numeric column of the daily reports (e.g. "Confirmed").
# Reads the globals `daily_filenames` and `compiled.stats`, which must be in
# the same order. A day on which the state has no row is plotted as a gap
# (NA) instead of breaking the plot.
# Returns the value of barplot() (the bar midpoints).
plot.dailyStat <- function(state, stat) {
  values <- vapply(seq_along(daily_filenames), function(day_idx) {
    report <- compiled.stats[[day_idx]]
    value <- report[which(report$Province_State == state), stat]
    if (length(value) == 0L) NA_real_ else as.numeric(value[1])
  }, numeric(1))
  names(values) <- daily_filenames
  barplot(values, main = paste0(state, " ", stat), las = 2, cex.axis = 1, cex.names = 0.5)
}
# Bar plot of the day-over-day change of one statistic for one state.
#
# state: value of Province_State to plot (e.g. "Oklahoma").
# stat:  name of a numeric column of the daily reports (e.g. "Confirmed").
# Reads the globals `daily_filenames` and `compiled.stats`, which must be in
# the same order. A day on which the state has no row counts as NA.
# Stops if fewer than two daily reports are available.
# Returns the value of barplot() (the bar midpoints).
plot.dailyStatRise <- function(state, stat) {
  values <- vapply(seq_along(daily_filenames), function(day_idx) {
    report <- compiled.stats[[day_idx]]
    value <- report[which(report$Province_State == state), stat]
    if (length(value) == 0L) NA_real_ else as.numeric(value[1])
  }, numeric(1))
  names(values) <- daily_filenames
  if (length(values) < 2L) {
    stop("At least two daily reports are needed to compute a rise.", call. = FALSE)
  }
  # diff() gives value[i] - value[i - 1]; each change is labelled with the
  # later of the two days. Kept as a one-row matrix, as before, so barplot()
  # draws it exactly the same way.
  rise.stat <- matrix(diff(values), nrow = 1,
                      dimnames = list(NULL, names(values)[-1]))
  barplot(rise.stat, main = paste0(state, " rise in ",stat), las = 2, cex.axis = 1, cex.names = 0.5)
}
# Testing rate per state, taken from the last compiled daily report: keep
# only states that report a rate and sort them in ascending order.
testing.data.state <- compiled.stats[[length(daily_filenames)]]
testing.data.state <- testing.data.state[
  !is.na(testing.data.state$Testing_Rate),
  c("Province_State", "Testing_Rate")
]
testing.data.state <- testing.data.state[order(testing.data.state$Testing_Rate), ]

# Bar colours: pink at or above the average rate, grey below it, and
# Oklahoma highlighted in light blue.
avg.test.rate <- mean(testing.data.state$Testing_Rate, na.rm = TRUE)
col.state <- rep("pink", nrow(testing.data.state))
col.state[testing.data.state$Testing_Rate < avg.test.rate] <- "grey"
col.state[testing.data.state$Province_State == "Oklahoma"] <- "lightblue"

# Wider left margin for the state names.
par(mar = c(5, 6, 4, 2))
barplot(
  testing.data.state$Testing_Rate,
  names.arg = testing.data.state$Province_State,
  horiz = TRUE,
  main = "Testing Rate by State",
  las = 2, cex.axis = 1, cex.names = 0.5,
  col = col.state, border = FALSE,
  xlab = "Total number of people tested per 100,000 persons."
)
# Mark the average rate with a labelled vertical line.
abline(v = avg.test.rate, col = "red")
text(
  x = avg.test.rate + 10, y = 1,
  labels = "Average Testing Rate",
  adj = c(0, 0.5), col = "red"
)
Province_State - The name of the State within the USA. Country_Region - The name of the Country (US). Last_Update - The most recent date the file was pushed. Lat - Latitude. Long_ - Longitude. Confirmed - Aggregated confirmed case count for the state. Deaths - Aggregated death count for the state. Recovered - Aggregated recovered case count for the state. Active - Aggregated confirmed cases that have not been resolved (Active = Confirmed - Recovered - Deaths). FIPS - Federal Information Processing Standards code that uniquely identifies counties within the USA. Incident_Rate - Confirmed cases per 100,000 persons. People_Tested - Total number of people who have been tested. People_Hospitalized - Total number of people hospitalized. Mortality_Rate - Number of recorded deaths * 100 / Number of confirmed cases. UID - Unique identifier for each row entry. ISO3 - Officially assigned country code identifiers. Testing_Rate - Total number of people tested per 100,000 persons. Hospitalization_Rate - Total number of people hospitalized * 100 / Number of confirmed cases.
# Use the first column of a matrix as its row names, then drop that column.
.first_col_as_rownames <- function(m) {
  rownames(m) <- m[, 1]
  m[, -1]
}

# Split each table into an "info" matrix (the first 12 columns) and a "data"
# matrix (column 1 plus everything after column 12). In both, column 1 (UID,
# as used elsewhere in this script) becomes the row names.
US.cases.info <- .first_col_as_rownames(as.matrix(US.cases[, 1:12]))
US.cases.data <- .first_col_as_rownames(as.matrix(US.cases[, -(2:12)]))
US.deaths.info <- .first_col_as_rownames(as.matrix(US.deaths[, 1:12]))
US.deaths.data <- .first_col_as_rownames(as.matrix(US.deaths[, -(2:12)]))

# Dimensions used by the sections below.
ndays.cases <- ncol(US.cases.data)
ndays.deaths <- ncol(US.deaths.data)
nobs <- nrow(US.cases.data)
# Bar plot of the cumulative COVID-19 count of one state, summed over all of
# its rows, starting at the first day with a positive total.
#
# state:    value of Province_State to plot.
# stat:     "cases" (default) or "deaths".
# logScale: if TRUE (default) the y axis is logarithmic.
# Reads the globals US.cases / US.cases.data or US.deaths / US.deaths.data,
# whose rows must be in the same order.
# Stops if the state has no positive total on any day.
# Returns the value of barplot() (the bar midpoints).
state.curve <- function(state, stat = c("cases", "deaths"), logScale = TRUE){
  # Resolve the default c("cases", "deaths") to a single value; comparing the
  # length-2 default inside if() is an error.
  stat <- match.arg(stat)
  # drop = FALSE keeps a matrix even when the state has a single row, so
  # colSums() always works.
  if (stat == "cases") {
    data <- US.cases.data[which(US.cases$Province_State == state), , drop = FALSE]
  } else {
    data <- US.deaths.data[which(US.deaths$Province_State == state), , drop = FALSE]
  }
  data.sum <- colSums(data)
  if (!any(data.sum > 0, na.rm = TRUE)) {
    stop("No COVID-19 ", stat, " recorded for ", state, call. = FALSE)
  }
  day.first.case <- min(which(data.sum > 0))
  n.days <- length(data.sum)
  if (isTRUE(as.logical(logScale))) {
    barplot(data.sum[day.first.case:n.days],
            main = paste0("Total COVID-19 ", stat," by date in ", state, ", log scale"),
            log = "y", las = 2, cex.axis = 1, cex.names = 0.5)
  } else {
    barplot(data.sum[day.first.case:n.days],
            main = paste0("Total COVID-19 ", stat," by date in ", state),
            las = 2, cex.axis = 1, cex.names = 0.5)
  }
}
# Bar plot of the day-over-day rise in the cumulative COVID-19 count of one
# state, starting at the first day with a positive rise.
#
# state: value of Province_State to plot.
# stat:  "cases" (default) or "deaths".
# Reads the globals US.cases / US.cases.data or US.deaths / US.deaths.data,
# whose rows must be in the same order.
# Stops if the state shows no positive rise on any day.
# Returns the value of barplot() (the bar midpoints).
state.rise <- function(state, stat = c("cases", "deaths")){
  # Resolve the default c("cases", "deaths") to a single value; comparing the
  # length-2 default inside if() is an error.
  stat <- match.arg(stat)
  # drop = FALSE keeps a matrix even when the state has a single row.
  if (stat == "cases") {
    data.thisState <- US.cases.data[which(US.cases$Province_State == state), , drop = FALSE]
  } else {
    data.thisState <- US.deaths.data[which(US.deaths$Province_State == state), , drop = FALSE]
  }
  data.sum <- colSums(data.thisState)
  # diff() gives total[i] - total[i - 1], named after the later day.
  rise.cases <- diff(data.sum)
  if (!any(rise.cases > 0, na.rm = TRUE)) {
    stop("No rise in COVID-19 ", stat, " recorded for ", state, call. = FALSE)
  }
  day.first.case <- min(which(rise.cases > 0))
  n.days <- length(rise.cases)
  barplot(rise.cases[day.first.case:n.days], main = paste0("Rise in COVID-19 ", stat, " by Date in ", state), las = 2, cex.axis = 1, cex.names = 0.5)
}
# Bar plot (log scale) of the cumulative COVID-19 count of one county,
# starting at the first day with a positive count.
#
# county: value of Admin2 to plot (e.g. "Tulsa").
# stat:   "cases" (default) or "deaths".
# Reads the globals US.cases / US.cases.data or US.deaths / US.deaths.data,
# whose rows must be in the same order.
# Stops if no row matches `county` or if it has no positive count. Warns when
# several rows share the name and plots their combined total.
# Returns the value of barplot() (the bar midpoints).
county.curve <- function(county, stat = c("cases", "deaths")){
  # Resolve the default c("cases", "deaths") to a single value; comparing the
  # length-2 default inside if() is an error.
  stat <- match.arg(stat)
  if (stat == "cases") {
    data <- US.cases.data[which(US.cases$Admin2 == county), , drop = FALSE]
  } else {
    data <- US.deaths.data[which(US.deaths$Admin2 == county), , drop = FALSE]
  }
  if (nrow(data) == 0) {
    stop("No county named ", county, " found.", call. = FALSE)
  }
  if (nrow(data) > 1) {
    # Admin2 is matched by name only, so equally named rows cannot be told apart.
    warning("Several counties are named ", county,
            "; plotting their combined total.", call. = FALSE)
  }
  # For a single matching row this is simply that row as a named vector.
  data <- colSums(data)
  if (!any(data > 0, na.rm = TRUE)) {
    stop("No COVID-19 ", stat, " recorded for ", county, call. = FALSE)
  }
  day.first.case <- min(which(data > 0))
  n.days <- length(data)
  barplot(data[day.first.case:n.days], main = paste0("Total COVID-19 ", stat," by date in ", county), log = "y", las = 2, cex.axis = 1, cex.names = 0.5)
}
# Cumulative case and death curves for Tulsa county.
county.curve(county = "Tulsa", stat = "cases")
county.curve(county = "Tulsa", stat = "deaths")

# Per-row summary table keyed by UID; the sections below add columns to it.
US.stats <- data.frame(UID = US.cases$UID)

# Nationwide cumulative cases, from the first day with more than 100 cases,
# on a linear and on a log scale.
par(mar = c(5, 5, 4, 2))
cases.total <- colSums(US.cases.data)
n.days <- length(cases.total)
day.first.case <- min(which(cases.total > 100))
barplot(
  cases.total[day.first.case:n.days],
  main = "Total COVID-19 cases by Date in US",
  las = 2, cex.axis = 1, cex.names = 0.5
)
barplot(
  cases.total[day.first.case:n.days],
  main = "Total COVID-19 cases by Date in US, log scale",
  las = 2, cex.axis = 1, cex.names = 0.5, log = "y"
)

# Nationwide cumulative deaths, from the first day with any death, on a
# linear and on a log scale.
deaths.total <- colSums(US.deaths.data)
n.days <- length(deaths.total)
day.first.case <- min(which(deaths.total > 0))
barplot(
  deaths.total[day.first.case:n.days],
  main = "Total COVID-19 deaths by Date in US",
  las = 2, cex.axis = 1, cex.names = 0.5
)
barplot(
  deaths.total[day.first.case:n.days],
  main = "Total COVID-19 deaths by Date in US, log scale",
  las = 2, cex.axis = 1, cex.names = 0.5, log = "y"
)
avg.rise.cases
# Day-over-day rise in cumulative cases: one row per row of US.cases.data,
# one column per day after the first. Column j holds day j + 1 minus day j
# and is labelled with the later day. Subtracting the two shifted matrices
# replaces the column-by-column loop and also works with one date column.
rise.cases <- US.cases.data[, -1, drop = FALSE] -
  US.cases.data[, -ndays.cases, drop = FALSE]
# The loop-built matrix had no row names; keep it that way so the column
# added to US.stats stays unnamed.
rownames(rise.cases) <- NULL

# Average daily rise per row.
US.stats$avg.rise.cases <- apply(rise.cases, 1, mean)

# Nationwide daily rise, plotted from the first day with a positive rise.
rise.cases.total <- colSums(rise.cases)
day.first.case <- min(which(rise.cases.total > 0))
n.days <- length(rise.cases.total)
barplot(rise.cases.total[day.first.case:n.days], main = "Rise in Cases of COVID-19 by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)
avg.rise.deaths
# Day-over-day rise in cumulative deaths: one row per row of US.deaths.data,
# one column per day after the first. Column j holds day j + 1 minus day j
# and is labelled with the later day. Subtracting the two shifted matrices
# replaces the column-by-column loop and also works with one date column.
rise.deaths <- US.deaths.data[, -1, drop = FALSE] -
  US.deaths.data[, -ndays.deaths, drop = FALSE]
# The loop-built matrix had no row names; keep it that way so the column
# added to US.stats stays unnamed.
rownames(rise.deaths) <- NULL

# Average daily rise per row.
US.stats$avg.rise.deaths <- apply(rise.deaths, 1, mean)

# Nationwide daily rise, plotted from the first day with a positive rise.
rise.deaths.total <- colSums(rise.deaths)
day.first.case <- min(which(rise.deaths.total > 0))
n.days <- length(rise.deaths.total)
barplot(rise.deaths.total[day.first.case:n.days], main = "Rise in Deaths of COVID-19 by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)
total.cases
# Cumulative cases on the last available day.
US.stats$total.cases <- US.cases.data[, ndays.cases]

# Cases per capita; rows with a population of 0 get NA instead of a division
# by zero.
US.stats$total.cases.percap <- replace(
  US.stats$total.cases / US.cases$Population,
  US.cases$Population == 0,
  NA
)
hist(US.stats$total.cases.percap)
total.deaths
# Cumulative deaths on the last available day.
US.stats[["total.deaths"]] <- US.deaths.data[, ndays.deaths]
total.deaths.percap
# Deaths per capita; rows with a population of 0 get NA instead of a division
# by zero.
US.stats$total.deaths.percap <- replace(
  US.stats$total.deaths / US.deaths$Population,
  US.deaths$Population == 0,
  NA
)
# Largest per-capita death rate over all rows.
max(US.stats$total.deaths.percap, na.rm = TRUE)
## [1] 0.003114828
total.deaths.percase. Note: the Johns Hopkins data contains errors; some rows have total.deaths > total.cases.
# Earlier variant, kept for reference (it set the ratio to 0 for rows
# without cases):
# pos.case.ind <- US.stats$total.cases > 0
# US.stats$total.deaths.percase[pos.case.ind] <- US.stats$total.deaths[pos.case.ind] / US.stats$total.cases[pos.case.ind]
# US.stats$total.deaths.percase[!pos.case.ind] <- 0

# Deaths per confirmed case; rows without any case get NA.
US.stats$total.deaths.percase <- replace(
  US.stats$total.deaths / US.stats$total.cases,
  US.stats$total.cases == 0,
  NA
)

# Show the inconsistent rows that report more deaths than cases.
subset(US.stats, total.deaths > total.cases)
## UID avg.rise.cases avg.rise.deaths total.cases
## 3155 84080008 0.00000000 0.02040816 0
## 3203 84090002 0.00000000 0.04081633 0
## 3204 84090004 0.00000000 0.02040816 0
## 3206 84090006 0.00000000 0.02040816 0
## 3222 84090024 0.00000000 0.88775510 0
## 3231 84090033 0.07142857 0.58163265 7
## 3252 84090056 0.00000000 0.06122449 0
## total.cases.percap total.deaths total.deaths.percap
## 3155 NA 2 NA
## 3203 NA 4 NA
## 3204 NA 2 NA
## 3206 NA 2 NA
## 3222 NA 87 NA
## 3231 NA 57 NA
## 3252 NA 6 NA
## total.deaths.percase
## 3155 NA
## 3203 NA
## 3204 NA
## 3206 NA
## 3222 NA
## 3231 8.142857
## 3252 NA
# Merge key: characters 4 to 8 of the UID after left-padding it with zeros
# to 8 characters (presumably the 5-digit county FIPS code - TODO confirm).
US.stats$ID <- substr(str_pad(US.stats$UID, 8, "left", pad = "0"), 4, 8)

# COVID statistics joined with the county factors, then Spearman
# correlations over everything except the first two (identifier) columns.
data.merge <- merge(US.stats, county_factors, by = "ID")
data.cor <- cor(
  data.merge[, -(1:2)],
  use = "complete.obs", method = "spearman"
)
corrplot.mixed(
  data.cor,
  upper = "ellipse", lower = "number",
  tl.pos = "lt", tl.cex = 1,
  lower.col = "black", number.cex = 0.5
)

# Same again after additionally joining the 500 Cities data.
data.merge2 <- merge(data.merge, county_500CitiesData, by = "ID", all.x = FALSE)
data.cor2 <- cor(
  data.merge2[, -(1:2)],
  use = "complete.obs", method = "spearman"
)
corrplot.mixed(
  data.cor2,
  upper = "ellipse", lower = "number",
  tl.pos = "lt", tl.cex = 1,
  lower.col = "black", number.cex = 0.5
)
# Only the block relating the first 7 variables to variables 8 through 42.
corrplot.mixed(
  data.cor2[1:7, 8:42],
  upper = "ellipse", lower = "number",
  tl.pos = "lt", tl.cex = 1,
  lower.col = "black", number.cex = 0.5
)
# Keep the rows of today's report whose FIPS code is present and at most two
# characters long, then zero-pad the code to two characters.
US.todaysReport.states <- US.todaysReport[
  !(is.na(US.todaysReport$FIPS) | nchar(US.todaysReport$FIPS) > 2),
]
US.todaysReport.states$FIPS <- str_pad(
  US.todaysReport.states$FIPS, 2, "left", pad = "0"
)

# The first two characters of the county ID are the state key; use it to
# attach the state-level report to every county row.
data.merge2$stateID <- substr(data.merge2$ID, start = 1, stop = 2)
data.merge3 <- merge(
  data.merge2, US.todaysReport.states,
  by.x = "stateID", by.y = "FIPS"
)
save(data.merge3, file = "county-Demo_and_Covid.Rda")

# Mixed model of per-capita cases with a random intercept per state.
this.lme <- lmer(
  "total.cases.percap ~ Affluence + Singletons.in.Tract + Seniors.in.Tract + African.Americans.in.Tract + Noncitizens.in.Tract + High.BP + Binge.Drinking + Cancer + Asthma + Heart.Disease + COPD + Smoking + Diabetes + No.Physical.Activity + Obesity + Poor.Sleeping.Habits + Poor.Mental.Health + Testing_Rate + Hospitalization_Rate + (1 | stateID)",
  data = data.merge3
)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## boundary (singular) fit: see ?isSingular
## Warning: Some predictor variables are on very different scales: consider
## rescaling
# Print the model summary including the correlation of the fixed effects.
# The correlation is requested from summary() as well: when only print() asks
# for it, the printed output carries the note "Correlation of fixed effects
# could have been required in summary()".
print(summary(this.lme, correlation = TRUE), correlation = TRUE)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## "total.cases.percap ~ Affluence + Singletons.in.Tract + Seniors.in.Tract + African.Americans.in.Tract + Noncitizens.in.Tract + High.BP + Binge.Drinking + Cancer + Asthma + Heart.Disease + COPD + Smoking + Diabetes + No.Physical.Activity + Obesity + Poor.Sleeping.Habits + Poor.Mental.Health + Testing_Rate + Hospitalization_Rate + (1 | stateID)"
## Data: data.merge3
##
## REML criterion at convergence: -1137.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.6422 -0.3170 -0.0523 0.1875 6.2397
##
## Random effects:
## Groups Name Variance Std.Dev.
## stateID (Intercept) 0.00000000 0.000000
## Residual 0.00001103 0.003321
## Number of obs: 166, groups: stateID, 31
##
## Fixed effects:
## Estimate Std. Error df
## (Intercept) -0.0028776929 0.0079639162 145.9999999978
## Affluence 0.0032519579 0.0009746577 145.9999999975
## Singletons.in.Tract 0.0016392502 0.0008323092 145.9999999977
## Seniors.in.Tract 0.0008072173 0.0010716599 145.9999999973
## African.Americans.in.Tract 0.0002046973 0.0009118003 145.9999999983
## Noncitizens.in.Tract 0.0007055608 0.0006612254 145.9999999975
## High.BP 0.0002501847 0.0001609348 145.9999999978
## Binge.Drinking 0.0001559277 0.0001257104 145.9999999967
## Cancer -0.0009012765 0.0009274109 145.9999999968
## Asthma 0.0005396386 0.0004280705 145.9999999974
## Heart.Disease 0.0000908842 0.0010660956 145.9999999978
## COPD 0.0005263782 0.0009180567 145.9999999978
## Smoking -0.0002070385 0.0001928343 145.9999999966
## Diabetes -0.0002637802 0.0004539162 145.9999999980
## No.Physical.Activity -0.0000237576 0.0001734182 145.9999999969
## Obesity 0.0001107196 0.0001485761 145.9999999988
## Poor.Sleeping.Habits -0.0000057617 0.0001468974 145.9999999983
## Poor.Mental.Health -0.0002601499 0.0003315387 145.9999999976
## Testing_Rate 0.0000004842 0.0000003324 145.9999999978
## Hospitalization_Rate -0.0001580183 0.0000639040 145.9999999982
## t value Pr(>|t|)
## (Intercept) -0.361 0.71837
## Affluence 3.337 0.00108 **
## Singletons.in.Tract 1.970 0.05079 .
## Seniors.in.Tract 0.753 0.45252
## African.Americans.in.Tract 0.224 0.82268
## Noncitizens.in.Tract 1.067 0.28771
## High.BP 1.555 0.12221
## Binge.Drinking 1.240 0.21683
## Cancer -0.972 0.33275
## Asthma 1.261 0.20945
## Heart.Disease 0.085 0.93218
## COPD 0.573 0.56728
## Smoking -1.074 0.28475
## Diabetes -0.581 0.56206
## No.Physical.Activity -0.137 0.89122
## Obesity 0.745 0.45735
## Poor.Sleeping.Habits -0.039 0.96877
## Poor.Mental.Health -0.785 0.43392
## Testing_Rate 1.457 0.14728
## Hospitalization_Rate -2.473 0.01456 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of fixed effects could have been required in summary()
##
## Correlation of Fixed Effects:
## (Intr) Afflnc Sng..T Snr..T A.A..T Nnc..T Hgh.BP Bng.Dr Cancer
## Affluence 0.138
## Sngltns.n.T -0.017 0.043
## Snrs.n.Trct 0.603 0.363 0.163
## Afrcn.Am..T 0.227 0.141 -0.443 0.168
## Nnctzns.n.T -0.020 0.073 0.050 0.045 -0.063
## High.BP 0.036 0.228 0.101 0.135 -0.109 0.399
## Bing.Drnkng -0.180 -0.238 -0.309 -0.213 0.150 0.074 0.170
## Cancer -0.605 -0.169 0.174 -0.342 -0.097 -0.160 -0.420 -0.177
## Asthma -0.314 -0.203 -0.158 -0.092 0.080 0.090 0.179 -0.014 0.009
## Heart.Dises -0.143 0.060 -0.269 -0.152 0.229 -0.098 -0.042 0.062 -0.454
## COPD 0.548 -0.013 0.110 0.245 0.043 0.298 0.243 0.180 -0.266
## Smoking -0.231 0.103 -0.174 -0.133 -0.118 -0.028 -0.119 -0.290 0.104
## Diabetes 0.046 -0.276 -0.167 -0.232 -0.270 -0.340 -0.507 0.036 0.204
## N.Physcl.Ac -0.182 -0.023 0.097 -0.019 -0.032 -0.225 -0.157 0.029 0.521
## Obesity 0.047 0.454 0.380 0.320 0.173 0.238 -0.054 -0.237 0.107
## Pr.Slpng.Hb -0.532 -0.427 0.190 -0.411 -0.443 0.034 -0.178 0.050 0.199
## Pr.Mntl.Hlt -0.323 0.304 -0.073 -0.090 0.104 -0.234 -0.136 -0.058 0.313
## Testing_Rat 0.167 -0.168 -0.145 -0.076 0.064 -0.134 -0.013 0.093 -0.192
## Hsptlztn_Rt -0.174 -0.172 -0.202 -0.257 -0.106 -0.145 -0.153 -0.172 0.083
## Asthma Hrt.Ds COPD Smokng Diabts N.Ph.A Obesty Pr.S.H Pr.M.H
## Affluence
## Sngltns.n.T
## Snrs.n.Trct
## Afrcn.Am..T
## Nnctzns.n.T
## High.BP
## Bing.Drnkng
## Cancer
## Asthma
## Heart.Dises 0.274
## COPD -0.338 -0.559
## Smoking 0.038 0.219 -0.546
## Diabetes -0.140 -0.205 -0.193 0.321
## N.Physcl.Ac 0.019 -0.399 -0.024 -0.337 -0.083
## Obesity -0.290 -0.122 0.189 -0.221 -0.413 -0.043
## Pr.Slpng.Hb 0.092 0.237 -0.228 0.081 -0.023 -0.097 -0.191
## Pr.Mntl.Hlt -0.204 0.103 -0.486 0.124 0.054 0.120 0.103 -0.212
## Testing_Rat -0.370 0.013 0.156 0.212 0.198 -0.391 -0.001 -0.109 -0.109
## Hsptlztn_Rt -0.014 0.098 -0.151 0.169 0.140 -0.042 -0.196 0.061 0.116
## Tstn_R
## Affluence
## Sngltns.n.T
## Snrs.n.Trct
## Afrcn.Am..T
## Nnctzns.n.T
## High.BP
## Bing.Drnkng
## Cancer
## Asthma
## Heart.Dises
## COPD
## Smoking
## Diabetes
## N.Physcl.Ac
## Obesity
## Pr.Slpng.Hb
## Pr.Mntl.Hlt
## Testing_Rat
## Hsptlztn_Rt 0.299
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
## convergence code: 0
## boundary (singular) fit: see ?isSingular
# Keep the model summary for later use.
this.lme.sum <- summary(object = this.lme)